Utilizing Ensemble Machine Learning Techniques
by: John-Eric Bonilla, MSDA
# Standard data-analysis stack: pandas/numpy for data wrangling,
# seaborn/matplotlib and plotly for visualisation.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE: IPython magic -- this file is a notebook export and only runs in Jupyter.
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
# Silence FutureWarning, then ALL warnings (the second filter subsumes the
# first). Deliberate for clean notebook output, but it can hide real issues.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=Warning)
# Load the raw Titanic passenger manifest from the working directory.
df = pd.read_csv(r'titanic.csv')
df.head()
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0.0 | 0.0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 1 | 1.0 | 1.0 | Allison, Master. Hudson Trevor | male | 0.9167 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1.0 | 0.0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 3 | 1.0 | 0.0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
| 4 | 1.0 | 0.0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
# Inspect column dtypes -- the numeric columns all arrived as float64.
df.dtypes
pclass float64 survived float64 name object sex object age float64 sibsp float64 parch float64 ticket object fare float64 cabin object embarked object boat object body float64 home.dest object dtype: object
# Missing-value counts per column; 'age', 'cabin', 'boat', 'body' and
# 'home.dest' are the heavy offenders.
df.isnull().sum()
pclass 1 survived 1 name 1 sex 1 age 264 sibsp 1 parch 1 ticket 1 fare 2 cabin 1015 embarked 3 boat 824 body 1189 home.dest 565 dtype: int64
The Age Attribute: imputing missing values
# Impute the 264 missing ages with the column mean.
# BUG FIX: the original used df['age'].fillna(..., inplace=True), i.e.
# chained inplace on a column view -- deprecated (FutureWarning) and a
# silent no-op under pandas copy-on-write. Assign the result back instead.
df['age'] = df['age'].fillna(df['age'].mean())
df.head(10)
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0.0 | 0.0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 1 | 1.0 | 1.0 | Allison, Master. Hudson Trevor | male | 0.9167 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1.0 | 0.0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 3 | 1.0 | 0.0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
| 4 | 1.0 | 0.0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1.0 | 2.0 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 5 | 1.0 | 1.0 | Anderson, Mr. Harry | male | 48.0000 | 0.0 | 0.0 | 19952 | 26.5500 | E12 | S | 3 | NaN | New York, NY |
| 6 | 1.0 | 1.0 | Andrews, Miss. Kornelia Theodosia | female | 63.0000 | 1.0 | 0.0 | 13502 | 77.9583 | D7 | S | 10 | NaN | Hudson, NY |
| 7 | 1.0 | 0.0 | Andrews, Mr. Thomas Jr | male | 39.0000 | 0.0 | 0.0 | 112050 | 0.0000 | A36 | S | NaN | NaN | Belfast, NI |
| 8 | 1.0 | 1.0 | Appleton, Mrs. Edward Dale (Charlotte Lamson) | female | 53.0000 | 2.0 | 0.0 | 11769 | 51.4792 | C101 | S | D | NaN | Bayside, Queens, NY |
| 9 | 1.0 | 0.0 | Artagaveytia, Mr. Ramon | male | 71.0000 | 0.0 | 0.0 | PC 17609 | 49.5042 | NaN | C | NaN | 22.0 | Montevideo, Uruguay |
The 'SibSp' and 'Parch' Attributes (Siblings/Spouses & Parents/Children)
# Point-plots of survival rate against 'sibsp' and 'parch'.
# BUG FIX: the original called plt.figure(i) before each sns.catplot; catplot
# is a *figure-level* seaborn function that creates its own figure, so the
# plt.figure call only produced the stray empty "<Figure ... with 0 Axes>".
for col in ['sibsp', 'parch']:
    sns.catplot(x=col, y='survived', data=df, kind='point', aspect=2)
<Figure size 432x288 with 0 Axes>
# Collapse siblings/spouses and parents/children into a single family-size
# feature, then discard the raw columns along with 'body' (recovered-body
# number) and 'boat' (lifeboat) -- both unusable as predictors.
df['family_size'] = df['sibsp'] + df['parch']
df.drop(columns=['sibsp', 'parch', 'body', 'boat'], inplace=True)
df.head()
| pclass | survived | name | sex | age | ticket | fare | cabin | embarked | home.dest | family_size | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 24160 | 211.3375 | B5 | S | St Louis, MO | 0.0 |
| 1 | 1.0 | 1.0 | Allison, Master. Hudson Trevor | male | 0.9167 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 |
| 2 | 1.0 | 0.0 | Allison, Miss. Helen Loraine | female | 2.0000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 |
| 3 | 1.0 | 0.0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 |
| 4 | 1.0 | 0.0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 |
# Re-check missing values after the age imputation and column drops.
df.isnull().sum()
pclass 1 survived 1 name 1 sex 1 age 0 ticket 1 fare 2 cabin 1015 embarked 3 home.dest 565 family_size 1 dtype: int64
# Survival rate for passengers with vs. without a recorded cabin --
# the large gap (0.65 vs 0.30) motivates the has_cabin indicator below.
df.groupby(df['cabin'].isnull())['survived'].mean()
cabin False 0.654237 True 0.302761 Name: survived, dtype: float64
# Binary indicator: 1 when a cabin value is recorded, 0 when it is missing.
df['has_cabin'] = df['cabin'].notnull().astype(int)
df.head(30)
| pclass | survived | name | sex | age | ticket | fare | cabin | embarked | home.dest | family_size | has_cabin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | Allen, Miss. Elisabeth Walton | female | 29.000000 | 24160 | 211.3375 | B5 | S | St Louis, MO | 0.0 | 1 |
| 1 | 1.0 | 1.0 | Allison, Master. Hudson Trevor | male | 0.916700 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
| 2 | 1.0 | 0.0 | Allison, Miss. Helen Loraine | female | 2.000000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
| 3 | 1.0 | 0.0 | Allison, Mr. Hudson Joshua Creighton | male | 30.000000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
| 4 | 1.0 | 0.0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.000000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
| 5 | 1.0 | 1.0 | Anderson, Mr. Harry | male | 48.000000 | 19952 | 26.5500 | E12 | S | New York, NY | 0.0 | 1 |
| 6 | 1.0 | 1.0 | Andrews, Miss. Kornelia Theodosia | female | 63.000000 | 13502 | 77.9583 | D7 | S | Hudson, NY | 1.0 | 1 |
| 7 | 1.0 | 0.0 | Andrews, Mr. Thomas Jr | male | 39.000000 | 112050 | 0.0000 | A36 | S | Belfast, NI | 0.0 | 1 |
| 8 | 1.0 | 1.0 | Appleton, Mrs. Edward Dale (Charlotte Lamson) | female | 53.000000 | 11769 | 51.4792 | C101 | S | Bayside, Queens, NY | 2.0 | 1 |
| 9 | 1.0 | 0.0 | Artagaveytia, Mr. Ramon | male | 71.000000 | PC 17609 | 49.5042 | NaN | C | Montevideo, Uruguay | 0.0 | 0 |
| 10 | 1.0 | 0.0 | Astor, Col. John Jacob | male | 47.000000 | PC 17757 | 227.5250 | C62 C64 | C | New York, NY | 1.0 | 1 |
| 11 | 1.0 | 1.0 | Astor, Mrs. John Jacob (Madeleine Talmadge Force) | female | 18.000000 | PC 17757 | 227.5250 | C62 C64 | C | New York, NY | 1.0 | 1 |
| 12 | 1.0 | 1.0 | Aubart, Mme. Leontine Pauline | female | 24.000000 | PC 17477 | 69.3000 | B35 | C | Paris, France | 0.0 | 1 |
| 13 | 1.0 | 1.0 | Barber, Miss. Ellen "Nellie" | female | 26.000000 | 19877 | 78.8500 | NaN | S | NaN | 0.0 | 0 |
| 14 | 1.0 | 1.0 | Barkworth, Mr. Algernon Henry Wilson | male | 80.000000 | 27042 | 30.0000 | A23 | S | Hessle, Yorks | 0.0 | 1 |
| 15 | 1.0 | 0.0 | Baumann, Mr. John D | male | 29.881135 | PC 17318 | 25.9250 | NaN | S | New York, NY | 0.0 | 0 |
| 16 | 1.0 | 0.0 | Baxter, Mr. Quigg Edmond | male | 24.000000 | PC 17558 | 247.5208 | B58 B60 | C | Montreal, PQ | 1.0 | 1 |
| 17 | 1.0 | 1.0 | Baxter, Mrs. James (Helene DeLaudeniere Chaput) | female | 50.000000 | PC 17558 | 247.5208 | B58 B60 | C | Montreal, PQ | 1.0 | 1 |
| 18 | 1.0 | 1.0 | Bazzani, Miss. Albina | female | 32.000000 | 11813 | 76.2917 | D15 | C | NaN | 0.0 | 1 |
| 19 | 1.0 | 0.0 | Beattie, Mr. Thomson | male | 36.000000 | 13050 | 75.2417 | C6 | C | Winnipeg, MN | 0.0 | 1 |
| 20 | 1.0 | 1.0 | Beckwith, Mr. Richard Leonard | male | 37.000000 | 11751 | 52.5542 | D35 | S | New York, NY | 2.0 | 1 |
| 21 | 1.0 | 1.0 | Beckwith, Mrs. Richard Leonard (Sallie Monypeny) | female | 47.000000 | 11751 | 52.5542 | D35 | S | New York, NY | 2.0 | 1 |
| 22 | 1.0 | 1.0 | Behr, Mr. Karl Howell | male | 26.000000 | 111369 | 30.0000 | C148 | C | New York, NY | 0.0 | 1 |
| 23 | 1.0 | 1.0 | Bidois, Miss. Rosalie | female | 42.000000 | PC 17757 | 227.5250 | NaN | C | NaN | 0.0 | 0 |
| 24 | 1.0 | 1.0 | Bird, Miss. Ellen | female | 29.000000 | PC 17483 | 221.7792 | C97 | S | NaN | 0.0 | 1 |
| 25 | 1.0 | 0.0 | Birnbaum, Mr. Jakob | male | 25.000000 | 13905 | 26.0000 | NaN | C | San Francisco, CA | 0.0 | 0 |
| 26 | 1.0 | 1.0 | Bishop, Mr. Dickinson H | male | 25.000000 | 11967 | 91.0792 | B49 | C | Dowagiac, MI | 1.0 | 1 |
| 27 | 1.0 | 1.0 | Bishop, Mrs. Dickinson H (Helen Walton) | female | 19.000000 | 11967 | 91.0792 | B49 | C | Dowagiac, MI | 1.0 | 1 |
| 28 | 1.0 | 1.0 | Bissette, Miss. Amelia | female | 35.000000 | PC 17760 | 135.6333 | C99 | S | NaN | 0.0 | 1 |
| 29 | 1.0 | 1.0 | Bjornstrom-Steffansson, Mr. Mauritz Hakan | male | 28.000000 | 110564 | 26.5500 | C52 | S | Stockholm, Sweden / Washington, DC | 0.0 | 1 |
# Encode sex numerically for the models: male -> 0, female -> 1.
df['sex'] = df['sex'].map({'male': 0, 'female': 1})
df.head()
| pclass | survived | name | sex | age | ticket | fare | cabin | embarked | home.dest | family_size | has_cabin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | Allen, Miss. Elisabeth Walton | 1.0 | 29.0000 | 24160 | 211.3375 | B5 | S | St Louis, MO | 0.0 | 1 |
| 1 | 1.0 | 1.0 | Allison, Master. Hudson Trevor | 0.0 | 0.9167 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
| 2 | 1.0 | 0.0 | Allison, Miss. Helen Loraine | 1.0 | 2.0000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
| 3 | 1.0 | 0.0 | Allison, Mr. Hudson Joshua Creighton | 0.0 | 30.0000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
| 4 | 1.0 | 0.0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | 1.0 | 25.0000 | 113781 | 151.5500 | C22 C26 | S | Montreal, PQ / Chesterville, ON | 3.0 | 1 |
# Does a missing 'embarked' value correlate with survival? (Only 3 rows.)
df.groupby(df['embarked'].isnull())['survived'].mean()
embarked False 0.381025 True 1.000000 Name: survived, dtype: float64
# Does a missing 'home.dest' value correlate with survival?
df.groupby(df['home.dest'].isnull())['survived'].mean()
home.dest False 0.465772 True 0.271277 Name: survived, dtype: float64
# Drop the remaining text / high-cardinality columns that won't feed the
# models ('cabin' is already summarised by the has_cabin indicator).
df.drop(columns=['cabin', 'embarked', 'name', 'ticket', 'home.dest'],
        inplace=True)
df.head()
| pclass | survived | sex | age | fare | family_size | has_cabin | |
|---|---|---|---|---|---|---|---|
| 0 | 1.0 | 1.0 | 1.0 | 29.0000 | 211.3375 | 0.0 | 1 |
| 1 | 1.0 | 1.0 | 0.0 | 0.9167 | 151.5500 | 3.0 | 1 |
| 2 | 1.0 | 0.0 | 1.0 | 2.0000 | 151.5500 | 3.0 | 1 |
| 3 | 1.0 | 0.0 | 0.0 | 30.0000 | 151.5500 | 3.0 | 1 |
| 4 | 1.0 | 0.0 | 1.0 | 25.0000 | 151.5500 | 3.0 | 1 |
# Remaining missing values before the final fill (one or two per column).
df.isnull().sum()
pclass 1 survived 1 sex 1 age 0 fare 2 family_size 1 has_cabin 0 dtype: int64
# Forward-fill the handful of remaining NaNs.
# NOTE(review): ffill also fills the 'survived' target column -- consider
# dropping those one or two rows instead of inventing a label.
df.ffill(axis=0, inplace=True)
# Persist the cleaned dataset. BUG FIX: index=False keeps the row index out
# of the file; the original wrote it, and it resurfaced downstream as a
# meaningless 'Unnamed: 0' model feature.
df.to_csv('cleaned_titanic.csv', index=False)
# BUG FIX: df.describe without parentheses only displayed the bound method
# (see the recorded output); call it to get the summary statistics.
df.describe()
<bound method NDFrame.describe of pclass survived sex age fare family_size has_cabin 0 1.0 1.0 1.0 29.000000 211.3375 0.0 1 1 1.0 1.0 0.0 0.916700 151.5500 3.0 1 2 1.0 0.0 1.0 2.000000 151.5500 3.0 1 3 1.0 0.0 0.0 30.000000 151.5500 3.0 1 4 1.0 0.0 1.0 25.000000 151.5500 3.0 1 ... ... ... ... ... ... ... ... 1305 3.0 0.0 1.0 29.881135 14.4542 1.0 0 1306 3.0 0.0 0.0 26.500000 7.2250 0.0 0 1307 3.0 0.0 0.0 27.000000 7.2250 0.0 0 1308 3.0 0.0 0.0 29.000000 7.8750 0.0 0 1309 3.0 0.0 0.0 29.881135 7.8750 0.0 0 [1310 rows x 7 columns]>
def plot_feature_distribution(frame, feature):
    """Show a side-by-side boxplot and histogram for one column of *frame*."""
    fig = make_subplots(rows=1, cols=2)
    fig.add_trace(go.Box(y=frame[feature]), row=1, col=1)
    fig.add_trace(go.Histogram(x=frame[feature]), row=1, col=2)
    fig.update_layout(height=600,
                      title_text="Boxplot and histogram for feature: " + str(feature),
                      showlegend=False)
    fig.show()

# The original repeated the same subplot code six times for
# features[1]..features[6]; a helper plus a loop keeps it DRY and produces
# the identical sequence of figures.
features = df.columns
for idx in range(1, 7):
    plot_feature_distribution(df, features[idx])
from sklearn.model_selection import train_test_split
# Reload the cleaned dataset from disk.
# NOTE(review): as originally written it carries an 'Unnamed: 0' index
# column (to_csv wrote the index), which ends up in the feature matrix.
df = pd.read_csv(r'cleaned_titanic.csv')
df.head()
| Unnamed: 0 | pclass | survived | sex | age | fare | family_size | has_cabin | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 1.0 | 1.0 | 1.0 | 29.0000 | 211.3375 | 0.0 | 1 |
| 1 | 1 | 1.0 | 1.0 | 0.0 | 0.9167 | 151.5500 | 3.0 | 1 |
| 2 | 2 | 1.0 | 0.0 | 1.0 | 2.0000 | 151.5500 | 3.0 | 1 |
| 3 | 3 | 1.0 | 0.0 | 0.0 | 30.0000 | 151.5500 | 3.0 | 1 |
| 4 | 4 | 1.0 | 0.0 | 1.0 | 25.0000 | 151.5500 | 3.0 | 1 |
# Separate predictors from the 'survived' target, then carve out a
# 60/20/20 train/validation/test split via two chained splits with a
# fixed random seed for reproducibility.
features = df.drop('survived', axis=1)
target = df['survived']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.4, random_state=1965)
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.5, random_state=1965)
# Sanity-check the split proportions (expect 0.6 / 0.2 / 0.2).
for part in (y_train, y_val, y_test):
    print(round(len(part) / len(target), 2))
0.6 0.2 0.2
# Persist each split to its own CSV, features and targets separately,
# without the pandas row index.
for frame, path in [(X_train, 'train_features.csv'),
                    (X_val, 'val_features.csv'),
                    (X_test, 'test_features.csv'),
                    (y_train, 'train_target.csv'),
                    (y_val, 'val_target.csv'),
                    (y_test, 'test_target.csv')]:
    frame.to_csv(path, index=False)
# Ensemble models, joblib for model persistence, grid search for tuning.
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
import joblib
from sklearn.model_selection import GridSearchCV
#Examine all the accessible hyperparameters
# (default GradientBoostingClassifier settings, for reference before tuning)
GradientBoostingClassifier().get_params()
{'ccp_alpha': 0.0,
'criterion': 'friedman_mse',
'init': None,
'learning_rate': 0.1,
'loss': 'deviance',
'max_depth': 3,
'max_features': None,
'max_leaf_nodes': None,
'min_impurity_decrease': 0.0,
'min_impurity_split': None,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_iter_no_change': None,
'presort': 'deprecated',
'random_state': None,
'subsample': 1.0,
'tol': 0.0001,
'validation_fraction': 0.1,
'verbose': 0,
'warm_start': False}
#Examine all the accessible hyperparameters
# NOTE(review): AdaBoostClassifier is inspected here but never trained
# anywhere below -- either tune it too or drop the import.
AdaBoostClassifier().get_params()
{'algorithm': 'SAMME.R',
'base_estimator': None,
'learning_rate': 1.0,
'n_estimators': 50,
'random_state': None}
# Load the training split back from disk for model tuning.
tr_features = pd.read_csv(r'train_features.csv')
tr_target = pd.read_csv(r'train_target.csv')
# 786 rows (60% of 1310); 7 columns incl. the leaked 'Unnamed: 0' index.
tr_features.shape
(786, 7)
# Single target column, same 786 rows as the features.
tr_target.shape
(786, 1)
# Confirm the target has no missing values after cleaning.
tr_target.isnull().sum()
survived 0 dtype: int64
# Confirm the features have no missing values after cleaning.
tr_features.isnull().sum()
Unnamed: 0 0 pclass 0 sex 0 age 0 fare 0 family_size 0 has_cabin 0 dtype: int64
def print_results(results):
    """Summarise a fitted GridSearchCV: print the best hyperparameters,
    then the mean CV score (+/- two standard deviations) for every
    parameter combination tried.
    """
    # BUG FIX: original message read "The best hyperparameter are".
    print('The best hyperparameters are: {}\n'.format(results.best_params_))
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))
# Grid-search a GradientBoostingClassifier over tree count, depth and
# learning rate with 5-fold cross-validation.
gb = GradientBoostingClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500],
    'max_depth': [1, 3, 5, 7, 9],
    # NOTE(review): learning rates of 10 and 100 are far outside the useful
    # range and predictably collapse accuracy (see the results below).
    'learning_rate': [0.01, 0.1, 1, 10, 100]
}
cv = GridSearchCV(gb, parameters, cv=5)
# NOTE(review): tr_features still contains the 'Unnamed: 0' index column,
# which the model treats as a (meaningless) feature.
cv.fit(tr_features, tr_target.values.ravel()) # ravel changes column to an array
print_results(cv)
The best hyperparameter are: {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 250}
0.627 (+/-0.006) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 5}
0.726 (+/-0.113) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 50}
0.766 (+/-0.073) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 250}
0.783 (+/-0.06) for {'learning_rate': 0.01, 'max_depth': 1, 'n_estimators': 500}
0.627 (+/-0.006) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 5}
0.791 (+/-0.035) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 50}
0.813 (+/-0.023) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 250}
0.816 (+/-0.015) for {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 500}
0.627 (+/-0.006) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 5}
0.789 (+/-0.039) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 50}
0.817 (+/-0.028) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 250}
0.819 (+/-0.035) for {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500}
0.627 (+/-0.006) for {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 5}
0.808 (+/-0.025) for {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 50}
0.822 (+/-0.019) for {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 250}
0.816 (+/-0.008) for {'learning_rate': 0.01, 'max_depth': 7, 'n_estimators': 500}
0.627 (+/-0.006) for {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 5}
0.796 (+/-0.056) for {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 50}
0.809 (+/-0.017) for {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 250}
0.817 (+/-0.025) for {'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 500}
0.726 (+/-0.113) for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 5}
0.784 (+/-0.049) for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 50}
0.786 (+/-0.04) for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 250}
0.79 (+/-0.059) for {'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 500}
0.791 (+/-0.035) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 5}
0.812 (+/-0.022) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
0.807 (+/-0.031) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 250}
0.799 (+/-0.062) for {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
0.788 (+/-0.041) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 5}
0.817 (+/-0.032) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 50}
0.804 (+/-0.035) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 250}
0.804 (+/-0.021) for {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500}
0.808 (+/-0.025) for {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 5}
0.812 (+/-0.026) for {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 50}
0.805 (+/-0.043) for {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 250}
0.799 (+/-0.039) for {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 500}
0.796 (+/-0.053) for {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 5}
0.805 (+/-0.024) for {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 50}
0.789 (+/-0.042) for {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 250}
0.799 (+/-0.03) for {'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 500}
0.788 (+/-0.07) for {'learning_rate': 1, 'max_depth': 1, 'n_estimators': 5}
0.788 (+/-0.033) for {'learning_rate': 1, 'max_depth': 1, 'n_estimators': 50}
0.785 (+/-0.035) for {'learning_rate': 1, 'max_depth': 1, 'n_estimators': 250}
0.78 (+/-0.026) for {'learning_rate': 1, 'max_depth': 1, 'n_estimators': 500}
0.793 (+/-0.03) for {'learning_rate': 1, 'max_depth': 3, 'n_estimators': 5}
0.77 (+/-0.023) for {'learning_rate': 1, 'max_depth': 3, 'n_estimators': 50}
0.779 (+/-0.019) for {'learning_rate': 1, 'max_depth': 3, 'n_estimators': 250}
0.789 (+/-0.047) for {'learning_rate': 1, 'max_depth': 3, 'n_estimators': 500}
0.771 (+/-0.039) for {'learning_rate': 1, 'max_depth': 5, 'n_estimators': 5}
0.786 (+/-0.013) for {'learning_rate': 1, 'max_depth': 5, 'n_estimators': 50}
0.8 (+/-0.036) for {'learning_rate': 1, 'max_depth': 5, 'n_estimators': 250}
0.795 (+/-0.028) for {'learning_rate': 1, 'max_depth': 5, 'n_estimators': 500}
0.791 (+/-0.026) for {'learning_rate': 1, 'max_depth': 7, 'n_estimators': 5}
0.796 (+/-0.039) for {'learning_rate': 1, 'max_depth': 7, 'n_estimators': 50}
0.803 (+/-0.023) for {'learning_rate': 1, 'max_depth': 7, 'n_estimators': 250}
0.768 (+/-0.03) for {'learning_rate': 1, 'max_depth': 7, 'n_estimators': 500}
0.78 (+/-0.053) for {'learning_rate': 1, 'max_depth': 9, 'n_estimators': 5}
0.804 (+/-0.036) for {'learning_rate': 1, 'max_depth': 9, 'n_estimators': 50}
0.782 (+/-0.044) for {'learning_rate': 1, 'max_depth': 9, 'n_estimators': 250}
0.78 (+/-0.035) for {'learning_rate': 1, 'max_depth': 9, 'n_estimators': 500}
0.234 (+/-0.073) for {'learning_rate': 10, 'max_depth': 1, 'n_estimators': 5}
0.234 (+/-0.073) for {'learning_rate': 10, 'max_depth': 1, 'n_estimators': 50}
0.234 (+/-0.073) for {'learning_rate': 10, 'max_depth': 1, 'n_estimators': 250}
0.234 (+/-0.073) for {'learning_rate': 10, 'max_depth': 1, 'n_estimators': 500}
0.332 (+/-0.301) for {'learning_rate': 10, 'max_depth': 3, 'n_estimators': 5}
0.325 (+/-0.31) for {'learning_rate': 10, 'max_depth': 3, 'n_estimators': 50}
0.325 (+/-0.31) for {'learning_rate': 10, 'max_depth': 3, 'n_estimators': 250}
0.325 (+/-0.31) for {'learning_rate': 10, 'max_depth': 3, 'n_estimators': 500}
0.444 (+/-0.12) for {'learning_rate': 10, 'max_depth': 5, 'n_estimators': 5}
0.441 (+/-0.114) for {'learning_rate': 10, 'max_depth': 5, 'n_estimators': 50}
0.445 (+/-0.115) for {'learning_rate': 10, 'max_depth': 5, 'n_estimators': 250}
0.443 (+/-0.117) for {'learning_rate': 10, 'max_depth': 5, 'n_estimators': 500}
0.59 (+/-0.098) for {'learning_rate': 10, 'max_depth': 7, 'n_estimators': 5}
0.543 (+/-0.136) for {'learning_rate': 10, 'max_depth': 7, 'n_estimators': 50}
0.582 (+/-0.109) for {'learning_rate': 10, 'max_depth': 7, 'n_estimators': 250}
0.547 (+/-0.137) for {'learning_rate': 10, 'max_depth': 7, 'n_estimators': 500}
0.635 (+/-0.104) for {'learning_rate': 10, 'max_depth': 9, 'n_estimators': 5}
0.629 (+/-0.109) for {'learning_rate': 10, 'max_depth': 9, 'n_estimators': 50}
0.63 (+/-0.105) for {'learning_rate': 10, 'max_depth': 9, 'n_estimators': 250}
0.617 (+/-0.12) for {'learning_rate': 10, 'max_depth': 9, 'n_estimators': 500}
0.373 (+/-0.006) for {'learning_rate': 100, 'max_depth': 1, 'n_estimators': 5}
0.373 (+/-0.006) for {'learning_rate': 100, 'max_depth': 1, 'n_estimators': 50}
0.373 (+/-0.006) for {'learning_rate': 100, 'max_depth': 1, 'n_estimators': 250}
0.373 (+/-0.006) for {'learning_rate': 100, 'max_depth': 1, 'n_estimators': 500}
0.282 (+/-0.056) for {'learning_rate': 100, 'max_depth': 3, 'n_estimators': 5}
0.275 (+/-0.033) for {'learning_rate': 100, 'max_depth': 3, 'n_estimators': 50}
0.282 (+/-0.056) for {'learning_rate': 100, 'max_depth': 3, 'n_estimators': 250}
0.275 (+/-0.033) for {'learning_rate': 100, 'max_depth': 3, 'n_estimators': 500}
0.419 (+/-0.122) for {'learning_rate': 100, 'max_depth': 5, 'n_estimators': 5}
0.427 (+/-0.125) for {'learning_rate': 100, 'max_depth': 5, 'n_estimators': 50}
0.419 (+/-0.123) for {'learning_rate': 100, 'max_depth': 5, 'n_estimators': 250}
0.42 (+/-0.122) for {'learning_rate': 100, 'max_depth': 5, 'n_estimators': 500}
0.482 (+/-0.067) for {'learning_rate': 100, 'max_depth': 7, 'n_estimators': 5}
0.424 (+/-0.075) for {'learning_rate': 100, 'max_depth': 7, 'n_estimators': 50}
0.421 (+/-0.089) for {'learning_rate': 100, 'max_depth': 7, 'n_estimators': 250}
0.452 (+/-0.046) for {'learning_rate': 100, 'max_depth': 7, 'n_estimators': 500}
0.555 (+/-0.141) for {'learning_rate': 100, 'max_depth': 9, 'n_estimators': 5}
0.608 (+/-0.079) for {'learning_rate': 100, 'max_depth': 9, 'n_estimators': 50}
0.534 (+/-0.143) for {'learning_rate': 100, 'max_depth': 9, 'n_estimators': 250}
0.564 (+/-0.118) for {'learning_rate': 100, 'max_depth': 9, 'n_estimators': 500}
# Best gradient-boosting model found by the grid search.
cv.best_estimator_
GradientBoostingClassifier(learning_rate=0.01, max_depth=7, n_estimators=250)
# Serialise the tuned gradient-boosting model for later comparison.
joblib.dump(cv.best_estimator_, 'model_gb.pkl')
['model_gb.pkl']
from sklearn.ensemble import RandomForestClassifier
#Examine all the accessible hyperparameters
# (default RandomForestClassifier settings, for reference before tuning)
RandomForestClassifier().get_params()
{'bootstrap': True,
'ccp_alpha': 0.0,
'class_weight': None,
'criterion': 'gini',
'max_depth': None,
'max_features': 'auto',
'max_leaf_nodes': None,
'max_samples': None,
'min_impurity_decrease': 0.0,
'min_impurity_split': None,
'min_samples_leaf': 1,
'min_samples_split': 2,
'min_weight_fraction_leaf': 0.0,
'n_estimators': 100,
'n_jobs': None,
'oob_score': False,
'random_state': None,
'verbose': 0,
'warm_start': False}
def print_results(results):
    """Summarise a fitted GridSearchCV: print the best hyperparameters,
    then the mean CV score (+/- two standard deviations) for every
    parameter combination tried.

    NOTE(review): this redefines the identical helper from the gradient-
    boosting section; defining it once at the top would avoid the copy.
    """
    # BUG FIX: original message read "The best hyperparameter are".
    print('The best hyperparameters are: {}\n'.format(results.best_params_))
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))
# Grid-search a RandomForestClassifier over forest size and tree depth
# with 5-fold cross-validation.
rf = RandomForestClassifier()
parameters = {
    'n_estimators': [5, 50, 250, 500, 1000],
    'max_depth': [4, 8, 16, 32, None]
}
cv = GridSearchCV(rf, parameters, cv=5)
# NOTE(review): as above, 'Unnamed: 0' is still among the features.
cv.fit(tr_features, tr_target.values.ravel()) # ravel changes column vector to array
print_results(cv)
The best hyperparameter are: {'max_depth': 8, 'n_estimators': 250}
0.784 (+/-0.074) for {'max_depth': 4, 'n_estimators': 5}
0.794 (+/-0.057) for {'max_depth': 4, 'n_estimators': 50}
0.798 (+/-0.049) for {'max_depth': 4, 'n_estimators': 250}
0.803 (+/-0.049) for {'max_depth': 4, 'n_estimators': 500}
0.802 (+/-0.051) for {'max_depth': 4, 'n_estimators': 1000}
0.793 (+/-0.067) for {'max_depth': 8, 'n_estimators': 5}
0.812 (+/-0.052) for {'max_depth': 8, 'n_estimators': 50}
0.818 (+/-0.034) for {'max_depth': 8, 'n_estimators': 250}
0.817 (+/-0.037) for {'max_depth': 8, 'n_estimators': 500}
0.814 (+/-0.031) for {'max_depth': 8, 'n_estimators': 1000}
0.802 (+/-0.061) for {'max_depth': 16, 'n_estimators': 5}
0.807 (+/-0.05) for {'max_depth': 16, 'n_estimators': 50}
0.81 (+/-0.049) for {'max_depth': 16, 'n_estimators': 250}
0.81 (+/-0.034) for {'max_depth': 16, 'n_estimators': 500}
0.807 (+/-0.045) for {'max_depth': 16, 'n_estimators': 1000}
0.782 (+/-0.059) for {'max_depth': 32, 'n_estimators': 5}
0.802 (+/-0.055) for {'max_depth': 32, 'n_estimators': 50}
0.817 (+/-0.033) for {'max_depth': 32, 'n_estimators': 250}
0.803 (+/-0.041) for {'max_depth': 32, 'n_estimators': 500}
0.808 (+/-0.048) for {'max_depth': 32, 'n_estimators': 1000}
0.763 (+/-0.081) for {'max_depth': None, 'n_estimators': 5}
0.807 (+/-0.055) for {'max_depth': None, 'n_estimators': 50}
0.812 (+/-0.043) for {'max_depth': None, 'n_estimators': 250}
0.807 (+/-0.049) for {'max_depth': None, 'n_estimators': 500}
0.807 (+/-0.043) for {'max_depth': None, 'n_estimators': 1000}
# Best random-forest model found by the grid search.
cv.best_estimator_
RandomForestClassifier(max_depth=8, n_estimators=250)
# Serialise the tuned random-forest model for later comparison.
joblib.dump(cv.best_estimator_, 'model_rf.pkl')
['model_rf.pkl']
from sklearn.ensemble import StackingClassifier
#Examine all the accessible hyperparameters
# Stack the two tuned model families; the composite parameter grid exposes
# each base learner's settings under the 'gb__' / 'rf__' prefixes.
estimators = [('gb',GradientBoostingClassifier()), ('rf', RandomForestClassifier())]
StackingClassifier(estimators = estimators).get_params()
{'cv': None,
'estimators': [('gb', GradientBoostingClassifier()),
('rf', RandomForestClassifier())],
'final_estimator': None,
'n_jobs': None,
'passthrough': False,
'stack_method': 'auto',
'verbose': 0,
'gb': GradientBoostingClassifier(),
'rf': RandomForestClassifier(),
'gb__ccp_alpha': 0.0,
'gb__criterion': 'friedman_mse',
'gb__init': None,
'gb__learning_rate': 0.1,
'gb__loss': 'deviance',
'gb__max_depth': 3,
'gb__max_features': None,
'gb__max_leaf_nodes': None,
'gb__min_impurity_decrease': 0.0,
'gb__min_impurity_split': None,
'gb__min_samples_leaf': 1,
'gb__min_samples_split': 2,
'gb__min_weight_fraction_leaf': 0.0,
'gb__n_estimators': 100,
'gb__n_iter_no_change': None,
'gb__presort': 'deprecated',
'gb__random_state': None,
'gb__subsample': 1.0,
'gb__tol': 0.0001,
'gb__validation_fraction': 0.1,
'gb__verbose': 0,
'gb__warm_start': False,
'rf__bootstrap': True,
'rf__ccp_alpha': 0.0,
'rf__class_weight': None,
'rf__criterion': 'gini',
'rf__max_depth': None,
'rf__max_features': 'auto',
'rf__max_leaf_nodes': None,
'rf__max_samples': None,
'rf__min_impurity_decrease': 0.0,
'rf__min_impurity_split': None,
'rf__min_samples_leaf': 1,
'rf__min_samples_split': 2,
'rf__min_weight_fraction_leaf': 0.0,
'rf__n_estimators': 100,
'rf__n_jobs': None,
'rf__oob_score': False,
'rf__random_state': None,
'rf__verbose': 0,
'rf__warm_start': False}
from sklearn.linear_model import LogisticRegression
def print_results(results):
    """Report a GridSearchCV outcome: the best parameter set first, then
    every tried combination with its mean CV score and +/- 2*std spread."""
    print('The optimal hyper parameters for this model are: {}\n'.format(results.best_params_))
    cv_res = results.cv_results_
    for avg, spread, combo in zip(cv_res['mean_test_score'],
                                  cv_res['std_test_score'],
                                  cv_res['params']):
        print('{} (+/-{}) for {}'.format(round(avg, 3), round(spread * 2, 3), combo))
# Base learners for the stacking ensemble; the final (meta) estimator
# defaults to LogisticRegression unless overridden in the grid below.
estimators = [
    ('rf', RandomForestClassifier()),
    ('gb', GradientBoostingClassifier()),
]
sc = StackingClassifier(estimators=estimators)
sc.get_params()
{'cv': None,
'estimators': [('rf', RandomForestClassifier()),
('gb', GradientBoostingClassifier())],
'final_estimator': None,
'n_jobs': None,
'passthrough': False,
'stack_method': 'auto',
'verbose': 0,
'rf': RandomForestClassifier(),
'gb': GradientBoostingClassifier(),
'rf__bootstrap': True,
'rf__ccp_alpha': 0.0,
'rf__class_weight': None,
'rf__criterion': 'gini',
'rf__max_depth': None,
'rf__max_features': 'auto',
'rf__max_leaf_nodes': None,
'rf__max_samples': None,
'rf__min_impurity_decrease': 0.0,
'rf__min_impurity_split': None,
'rf__min_samples_leaf': 1,
'rf__min_samples_split': 2,
'rf__min_weight_fraction_leaf': 0.0,
'rf__n_estimators': 100,
'rf__n_jobs': None,
'rf__oob_score': False,
'rf__random_state': None,
'rf__verbose': 0,
'rf__warm_start': False,
'gb__ccp_alpha': 0.0,
'gb__criterion': 'friedman_mse',
'gb__init': None,
'gb__learning_rate': 0.1,
'gb__loss': 'deviance',
'gb__max_depth': 3,
'gb__max_features': None,
'gb__max_leaf_nodes': None,
'gb__min_impurity_decrease': 0.0,
'gb__min_impurity_split': None,
'gb__min_samples_leaf': 1,
'gb__min_samples_split': 2,
'gb__min_weight_fraction_leaf': 0.0,
'gb__n_estimators': 100,
'gb__n_iter_no_change': None,
'gb__presort': 'deprecated',
'gb__random_state': None,
'gb__subsample': 1.0,
'gb__tol': 0.0001,
'gb__validation_fraction': 0.1,
'gb__verbose': 0,
'gb__warm_start': False}
# Tune the stack: base-learner sizes, final-estimator regularisation
# strength C, and whether raw features pass through to the meta-learner.
parameters = {
    'gb__n_estimators': [50, 100, 250],
    'rf__n_estimators': [50, 100, 250],
    'final_estimator': [LogisticRegression(C=c) for c in (0.1, 1, 10, 50)],
    'passthrough': [True, False],
}
cv = GridSearchCV(sc, parameters, cv=5)
cv.fit(tr_features, tr_target.values.ravel())
print_results(cv)
The optimal hyper parameters for this model are: {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 250}
0.804 (+/-0.046) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 50}
0.804 (+/-0.038) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 100}
0.8 (+/-0.046) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 250}
0.819 (+/-0.03) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 50}
0.816 (+/-0.023) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 100}
0.816 (+/-0.029) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 250}
0.805 (+/-0.047) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 50}
0.799 (+/-0.04) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 100}
0.804 (+/-0.041) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 250}
0.823 (+/-0.039) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 50}
0.826 (+/-0.039) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 100}
0.817 (+/-0.034) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 250}
0.81 (+/-0.036) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 50}
0.802 (+/-0.036) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 100}
0.807 (+/-0.037) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 250}
0.814 (+/-0.04) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 50}
0.819 (+/-0.049) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 100}
0.823 (+/-0.036) for {'final_estimator': LogisticRegression(C=0.1), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 250}
0.8 (+/-0.034) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 50}
0.809 (+/-0.036) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 100}
0.803 (+/-0.037) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 250}
0.819 (+/-0.025) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 50}
0.817 (+/-0.022) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 100}
0.818 (+/-0.013) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 250}
0.812 (+/-0.036) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 50}
0.804 (+/-0.046) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 100}
0.803 (+/-0.043) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 250}
0.826 (+/-0.027) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 50}
0.824 (+/-0.021) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 100}
0.826 (+/-0.023) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 250}
0.804 (+/-0.04) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 50}
0.807 (+/-0.038) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 100}
0.805 (+/-0.04) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 250}
0.817 (+/-0.038) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 50}
0.817 (+/-0.041) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 100}
0.817 (+/-0.037) for {'final_estimator': LogisticRegression(C=1), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 250}
0.803 (+/-0.036) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 50}
0.807 (+/-0.041) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 100}
0.805 (+/-0.048) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 250}
0.814 (+/-0.033) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 50}
0.812 (+/-0.029) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 100}
0.813 (+/-0.026) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 250}
0.805 (+/-0.034) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 50}
0.803 (+/-0.044) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 100}
0.81 (+/-0.045) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 250}
0.821 (+/-0.03) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 50}
0.819 (+/-0.033) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 100}
0.817 (+/-0.032) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 250}
0.813 (+/-0.032) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 50}
0.809 (+/-0.029) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 100}
0.807 (+/-0.04) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 250}
0.817 (+/-0.036) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 50}
0.819 (+/-0.031) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 100}
0.814 (+/-0.025) for {'final_estimator': LogisticRegression(C=10), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 250}
0.804 (+/-0.044) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 50}
0.802 (+/-0.04) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 100}
0.805 (+/-0.04) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 50, 'passthrough': True, 'rf__n_estimators': 250}
0.808 (+/-0.029) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 50}
0.812 (+/-0.029) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 100}
0.814 (+/-0.03) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 50, 'passthrough': False, 'rf__n_estimators': 250}
0.804 (+/-0.04) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 50}
0.805 (+/-0.04) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 100}
0.804 (+/-0.034) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 100, 'passthrough': True, 'rf__n_estimators': 250}
0.818 (+/-0.034) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 50}
0.819 (+/-0.03) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 100}
0.817 (+/-0.028) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 100, 'passthrough': False, 'rf__n_estimators': 250}
0.807 (+/-0.034) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 50}
0.81 (+/-0.025) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 100}
0.809 (+/-0.024) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 250, 'passthrough': True, 'rf__n_estimators': 250}
0.817 (+/-0.042) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 50}
0.813 (+/-0.032) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 100}
0.813 (+/-0.032) for {'final_estimator': LogisticRegression(C=50), 'gb__n_estimators': 250, 'passthrough': False, 'rf__n_estimators': 250}
# Persist the best stacking ensemble found by the grid search for reuse below.
joblib.dump(cv.best_estimator_, 'stacked_mod.pkl')
['stacked_mod.pkl']
# Import standard performance libraries
from sklearn.metrics import accuracy_score, precision_score, recall_score
from time import time
# Reload the serialized models saved earlier in the notebook.
BG_Model = joblib.load('model_gb.pkl')  # gradient boosting model
RF_Model = joblib.load('model_rf.pkl')  # random forest model
STACKED_Model = joblib.load('stacked_mod.pkl')  # stacked ensemble (best grid-search estimator)
# Hold-out validation split written to CSV during preprocessing.
val_features = pd.read_csv('val_features.csv')
val_target = pd.read_csv('val_target.csv')
def evaluate_model(model, features, target):
    """Score a fitted classifier on a hold-out set and print a summary line.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    features : pd.DataFrame
        Feature matrix to predict on.
    target : array-like
        True binary labels (0/1) aligned with ``features``.

    Prints accuracy, precision and recall (rounded to 3 dp) plus the
    prediction latency in milliseconds. Returns None.
    """
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(target, pred), 3)
    precision = round(precision_score(target, pred), 3)
    recall = round(recall_score(target, pred), 3)
    latency_ms = round((end - start) * 1000, 1)
    # type(model).__name__ is more robust than parsing str(model) for the
    # class name (reprs can wrap across lines or lack parentheses).
    print(f'{type(model).__name__} -- Accuracy: {accuracy} / '
          f'Precision: {precision} / Recall: {recall} / Latency: {latency_ms}ms')
# Compare all three candidate models on the same validation split.
for mdl in [BG_Model, RF_Model, STACKED_Model]:
    evaluate_model(mdl, val_features, val_target)
GradientBoostingClassifier -- Accuracy: 0.798 / Precision: 0.802 / Recall: 0.676 / Latency: 3.0ms RandomForestClassifier -- Accuracy: 0.779 / Precision: 0.784 / Recall: 0.639 / Latency: 21.0ms StackingClassifier -- Accuracy: 0.786 / Precision: 0.783 / Recall: 0.667 / Latency: 24.0ms
# Untouched test split written to CSV during preprocessing.
test_features = pd.read_csv('test_features.csv')
test_target = pd.read_csv('test_target.csv')
# Final unbiased estimate: score the chosen gradient boosting model on the test set.
evaluate_model(BG_Model, test_features, test_target)
GradientBoostingClassifier -- Accuracy: 0.84 / Precision: 0.843 / Recall: 0.707 / Latency: 3.0ms
* The GB model performed even better on the test set than on the validation set.